First, I get the data.
# Read the combined Google Trends export (three header rows skipped),
# parse the month strings into Dates, and tag each month with the
# sitting president (Bush through 2008, Obama through 2016, then Trump).
immigration_data <- read_csv(
  'google_trends_data.csv',
  skip = 3,
  col_names = c('month', 'welfare_trend', 'crime_trend', 'report_trend')
) |>
  mutate(
    month = parse_date(month, "%Y-%m"),
    president = as.factor(
      ifelse(year(month) <= 2008, "Bush",
             ifelse(year(month) <= 2016, "Obama", "Trump"))
    )
  )
## Rows: 192 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (3): welfare_trend, crime_trend, report_trend
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Now, we recreate the plot.
# Recreate Figure 4: one facet per search series with a separate linear
# fit per presidency.
# BUG FIX: the original used factor(trend_type, labels = c(...)) with no
# levels= argument. labels= renames the DEFAULT (alphabetical) levels in
# order, so the crime_trend data was silently relabeled "report_trend",
# report_trend became "crime_trend", and the facets were mislabeled.
# Using levels= reorders the facets without renaming any data.
immigration_data |>
  pivot_longer(
    names_to = "trend_type",
    values_to = "trend_value",
    cols = ends_with("_trend")
  ) |>
  mutate(trend_type = factor(
    trend_type,
    levels = c("report_trend", "crime_trend", "welfare_trend")
  )) |>
  ggplot(aes(x = month, y = trend_value, color = president)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  facet_wrap(~ trend_type, dir = "v")
## Table 3
Now we do table 3.
# Table 3: dummy-code the Bush and Trump presidencies (Obama is the
# baseline) and regress each search trend on a linear time term plus
# the two presidency indicators.
table_3_data <- immigration_data |>
  mutate(
    is_bush = president == "Bush",
    is_trump = president == "Trump"
  )

trend_formulae <- list(
  crime = crime_trend ~ month + is_bush + is_trump,
  welfare = welfare_trend ~ month + is_bush + is_trump,
  report = report_trend ~ month + is_bush + is_trump
)
models <- lapply(trend_formulae, lm, data = table_3_data)
msummary(models, output = 'markdown')
| crime | welfare | report | |
|---|---|---|---|
| (Intercept) | -31.109 | -17.690 | 60.338 |
| (13.241) | (10.542) | (16.691) | |
| month | 0.003 | 0.002 | -0.001 |
| (0.001) | (0.001) | (0.001) | |
| is_bushTRUE | 2.836 | 1.673 | -0.748 |
| (2.388) | (1.901) | (3.010) | |
| is_trumpTRUE | 7.348 | 3.798 | 15.219 |
| (2.294) | (1.826) | (2.892) | |
| Num.Obs. | 192 | 192 | 192 |
| R2 | 0.351 | 0.265 | 0.197 |
| R2 Adj. | 0.341 | 0.253 | 0.184 |
| AIC | 1345.8 | 1258.3 | 1434.7 |
| BIC | 1362.1 | 1274.6 | 1451.0 |
| Log.Lik. | -667.905 | -624.134 | -712.370 |
| RMSE | 7.84 | 6.24 | 9.89 |
We are getting different values for the model. Not only are the constants off, but they have welfare increasing over time, while we have it decreasing. They also find big positive changes under Bush, where we find only small positive changes for Crime and Welfare; and for Reporting, where they have a small positive change, we have a small negative change. And they find bigger changes for Trump than we do.
But, this still supports their point from this table, which was that Trump’s election led to a large positive increase in all these searches, which we do find.
# Overlay each Table 3 model's fitted values (dashed) on per-president
# linear smooths. The dashed line has ONE slope with presidency-specific
# intercepts; the colored smooths fit a separate slope per president.
# (Idiom fixes: se = FALSE instead of se = F, method = "lm" as a string.)
table_3_data |>
  add_predictions(models[["report"]]) |>
  ggplot(aes(x = month, y = report_trend, color = president)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_line(aes(y = pred), linetype = "dashed")

table_3_data |>
  add_predictions(models[["crime"]]) |>
  ggplot(aes(x = month, y = crime_trend, color = president)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_line(aes(y = pred), linetype = "dashed")

table_3_data |>
  add_predictions(models[["welfare"]]) |>
  ggplot(aes(x = month, y = welfare_trend, color = president)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_line(aes(y = pred), linetype = "dashed")
This looks very different from our original plot, but that makes sense.
Before, geom_smooth was using * to have different slopes for the
different presidencies. Since here, we only have a constant based on who
was president, we have one slope for everything.
Now, we get the topic model data.
# Load the fitted STM objects (immigrFit, out) and build:
#   - document_topics: per-document topic proportions plus metadata,
#     with the date column parsed to a Date;
#   - topic_terms: the topic-by-term probability matrix, recovered by
#     exponentiating the stored log-beta and transposing so rows are
#     vocabulary terms and columns are topics.
load("TopicModel.RData")

document_topics <- make.dt(immigrFit, meta = out$meta) |>
  mutate(date = ymd(date))

topic_terms <- t(exp(immigrFit$beta$logbeta[[1]]))
rownames(topic_terms) <- out$vocab
# seq_len() instead of 1:ncol() — safe if the matrix were ever empty.
colnames(topic_terms) <- sprintf("Topic%d", seq_len(ncol(topic_terms)))
First, we make figure 2. It is about immigration coverage in general, not of a specific category, so I think all the documents there count.
# The campaign window is bracketed by the last pre-election segment
# date and the first post-election segment date.
campaign_start <- document_topics |>
  filter(time == "pre-election") |>
  pull(date) |>
  max()

campaign_end <- document_topics |>
  filter(time == "post-election") |>
  pull(date) |>
  min()
# Figure 2: monthly count of immigration segments per channel, with a
# separate loess smooth for each channel x campaign-period combination.
# Fixes: campaign_start/campaign_end are constants, not data columns,
# so they belong OUTSIDE aes() (mapping them recycles the value over
# every row); .groups = "drop" silences the summarise() message.
document_topics |>
  mutate(month = floor_date(date, "month")) |>
  group_by(month, channel, time) |>
  summarize(num_segments = n(), .groups = "drop") |>
  ggplot(aes(x = month, y = num_segments, color = channel)) +
  geom_point() +
  geom_smooth(aes(group = interaction(channel, time)), se = FALSE) +
  geom_vline(xintercept = campaign_start, linetype = "dashed") +
  geom_vline(xintercept = campaign_end, linetype = "dashed") +
  scale_color_manual(values = c("magenta", "red", "blue"))
## `summarise()` has grouped output by 'month', 'channel'. You can override using
## the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
We have essentially the same results as they do, but with a less severe
jump after the campaign start and a bigger drop before the inauguration.
But their overall findings still show.
# Monthly crime-related coverage (Topics 1 + 3) per channel, smoothed
# separately within each campaign period.
# Fixes: vline constants moved outside aes(); .groups = "drop" silences
# the summarise() grouping message.
document_topics |>
  mutate(month = floor_date(date, "month")) |>
  group_by(month, channel, time) |>
  summarize(
    proportion_1 = sum(Topic1),
    proportion_3 = sum(Topic3),
    crime_coverage = proportion_1 + proportion_3,
    .groups = "drop"
  ) |>
  ggplot(aes(x = month, y = crime_coverage, color = channel)) +
  geom_point() +
  geom_smooth(aes(group = interaction(channel, time)), se = FALSE) +
  geom_vline(xintercept = campaign_start, linetype = "dashed") +
  geom_vline(xintercept = campaign_end, linetype = "dashed") +
  scale_color_manual(values = c("magenta", "red", "blue"))
## `summarise()` has grouped output by 'month', 'channel'. You can override using
## the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Monthly welfare-related coverage (Topic 13) per channel, smoothed
# separately within each campaign period.
# Fixes: vline constants moved outside aes(); .groups = "drop" silences
# the summarise() grouping message.
document_topics |>
  mutate(month = floor_date(date, "month")) |>
  group_by(month, channel, time) |>
  summarize(welfare_coverage = sum(Topic13), .groups = "drop") |>
  ggplot(aes(x = month, y = welfare_coverage, color = channel)) +
  geom_point() +
  geom_smooth(aes(group = interaction(channel, time)), se = FALSE) +
  geom_vline(xintercept = campaign_start, linetype = "dashed") +
  geom_vline(xintercept = campaign_end, linetype = "dashed") +
  scale_color_manual(values = c("magenta", "red", "blue"))
## `summarise()` has grouped output by 'month', 'channel'. You can override using
## the `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
This time, we get identical results. I suspect that in the original, we
only differ because of how we deal with the months bifurcated by the
dotted lines. We were consistent, always treating them as multiple
points, causing the lines to cross the dotted lines. They only seem to
have done that for the second one for some reason.
Now we reproduce table 4. This is before we get the daily data, so we do it on a monthly level.
# Table 4 proxy at MONTHLY resolution: regress the report-search trend
# on coverage volume/composition, a Trump-era dummy, a linear time
# trend, and month-of-year seasonal dummies (ordered-factor polynomial
# contrasts, hence the .L/.Q/... terms in the output).
# Idiom fixes: label = TRUE instead of label = T; explicit
# .groups = "drop_last" keeps summarise()'s default grouping while
# silencing the message.
topic_table <- document_topics |>
  mutate(month = floor_date(date, "month")) |>
  group_by(month, channel, time) |>
  summarize(
    proportion_1 = sum(Topic1),
    proportion_3 = sum(Topic3),
    crime_coverage = proportion_1 + proportion_3,
    welfare_coverage = sum(Topic13),
    num_segments = n(),
    .groups = "drop_last"
  ) |>
  mutate(
    trump_admin = time == "post-election",
    month_of_year = month(month, label = TRUE)
  ) |>
  inner_join(immigration_data, by = "month")

lm(report_trend ~ num_segments + crime_coverage + welfare_coverage +
     trump_admin + month + month_of_year, topic_table) |>
  msummary(output = 'markdown')
| (1) | |
|---|---|
| (Intercept) | 25.700 |
| (31.861) | |
| num_segments | 0.015 |
| (0.004) | |
| crime_coverage | 0.010 |
| (0.050) | |
| welfare_coverage | -0.036 |
| (0.143) | |
| trump_adminTRUE | 14.941 |
| (2.210) | |
| month | 0.001 |
| (0.002) | |
| month_of_year.L | -12.171 |
| (2.006) | |
| month_of_year.Q | 9.143 |
| (1.911) | |
| month_of_year.C | 0.275 |
| (1.955) | |
| month_of_year^4 | 1.986 |
| (1.958) | |
| month_of_year^5 | 2.206 |
| (1.982) | |
| month_of_year^6 | -4.164 |
| (1.991) | |
| month_of_year^7 | 2.492 |
| (1.995) | |
| month_of_year^8 | 0.790 |
| (1.972) | |
| month_of_year^9 | -0.473 |
| (1.937) | |
| month_of_year^10 | -3.109 |
| (1.911) | |
| month_of_year^11 | 1.139 |
| (1.891) | |
| Num.Obs. | 202 |
| R2 | 0.677 |
| R2 Adj. | 0.649 |
| AIC | 1424.9 |
| BIC | 1484.4 |
| Log.Lik. | -694.433 |
| RMSE | 7.53 |
Obviously, this is different from their result, because it is on a monthly level. But we found significantly lower effects of number of segments and crime coverage. These things shouldn’t change, but perhaps the effect only exists in the short term (but if so, that changes the paper’s conclusions). Though we did still have a high modifier for Trump’s presidency, presumably because that was a long-term effect.
We want to recreate figure 4 and table 3 using the data they used to see if we still get the same results.
# The authors' monthly "report" series; rename its search column to
# match our naming scheme.
report_data <-
  'from_replication_files/google_trends_report.csv' |>
  read_csv() |>
  rename(report_trend = search)
## Rows: 190 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (2): year, search
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# The authors' monthly "crime" series; rename its search column to
# match our naming scheme.
crime_data <-
  'from_replication_files/google_trends_crime.csv' |>
  read_csv() |>
  rename(crime_trend = search)
## Rows: 190 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (2): year, search
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# The authors' monthly "welfare" series; rename its search column to
# match our naming scheme.
welfare_data <-
  'from_replication_files/google_trends_welfare.csv' |>
  read_csv() |>
  rename(welfare_trend = search)
## Rows: 190 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (2): year, search
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine the three separately-downloaded series on (year, month),
# rebuild a proper Date from the split year/month columns, and tag the
# presidency the same way as before.
trend_data_given <- report_data |>
  inner_join(crime_data, by = c("year", "month")) |>
  inner_join(welfare_data, by = c("year", "month")) |>
  mutate(
    month = parse_date(paste(year, month), "%Y %m"),
    president = as.factor(
      ifelse(year(month) <= 2008, "Bush",
             ifelse(year(month) <= 2016, "Obama", "Trump"))
    )
  )
# Figure 4 using the authors' data.
# BUG FIX: factor(..., labels = ...) without levels= renames the default
# alphabetical levels, mislabeling the facets (crime shown as report).
# levels= reorders without renaming. Also se = FALSE, method = "lm".
trend_data_given |>
  pivot_longer(
    names_to = "trend_type",
    values_to = "trend_value",
    cols = ends_with("_trend")
  ) |>
  mutate(trend_type = factor(
    trend_type,
    levels = c("report_trend", "crime_trend", "welfare_trend")
  )) |>
  ggplot(aes(x = month, y = trend_value, color = president)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  facet_wrap(~ trend_type, dir = "v")
# Table 3 refit on the authors' data: same specification as before
# (linear time trend plus Bush/Trump dummies, Obama as baseline).
table_3_data_given <- trend_data_given |>
  mutate(
    is_bush = president == "Bush",
    is_trump = president == "Trump"
  )

list(
  crime = crime_trend ~ month + is_bush + is_trump,
  welfare = welfare_trend ~ month + is_bush + is_trump,
  report = report_trend ~ month + is_bush + is_trump
) |>
  lapply(lm, data = table_3_data_given) |>
  msummary(output = 'markdown')
| crime | welfare | report | |
|---|---|---|---|
| (Intercept) | -4.163 | 28.585 | 72.574 |
| (23.241) | (20.710) | (16.089) | |
| month | 0.001 | 0.000 | -0.002 |
| (0.001) | (0.001) | (0.001) | |
| is_bushTRUE | 8.280 | 7.819 | 0.785 |
| (4.187) | (3.731) | (2.899) | |
| is_trumpTRUE | 19.903 | 18.560 | 19.716 |
| (4.027) | (3.588) | (2.788) | |
| Num.Obs. | 190 | 190 | 190 |
| R2 | 0.262 | 0.237 | 0.286 |
| R2 Adj. | 0.250 | 0.225 | 0.275 |
| AIC | 1544.4 | 1500.6 | 1404.7 |
| BIC | 1560.7 | 1516.8 | 1420.9 |
| Log.Lik. | -767.216 | -745.305 | -697.339 |
| RMSE | 13.72 | 12.23 | 9.50 |
With this, we get the same data as they do. We still don’t get quite the same regression, but we get closer. Interestingly, a lot of times, they have non-zero trends when we have zero trends.
But it’s also true that they downloaded this in 3 separate files while we did it in a single file. Since Google Trends scales each series relative to the others when you download them together, it’s possible that this changes things.
Now we will replicate Table 4 using the given day values.
given_daily_data <- read_csv('from_replication_files/gt_report_daily.csv')
## New names:
## Rows: 5751 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (3): ...1, search, search_adj date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Table 4 at DAILY resolution with the authors' daily search series:
# aggregate topic coverage per day/channel, then regress both the raw
# and adjusted search measures on coverage, a Trump-era dummy, a linear
# date trend, and day-of-week / month-of-year seasonal dummies.
# Idiom fixes: label = TRUE instead of label = T; explicit
# .groups = "drop_last" preserves summarise()'s default grouping while
# silencing the message.
topic_table_daily <- document_topics |>
  group_by(date, channel, time) |>
  summarize(
    proportion_1 = sum(Topic1),
    proportion_3 = sum(Topic3),
    crime_coverage = proportion_1 + proportion_3,
    welfare_coverage = sum(Topic13),
    num_segments = n(),
    .groups = "drop_last"
  ) |>
  mutate(
    trump_admin = time == "post-election",
    month_of_year = month(date, label = TRUE),
    day_of_week = wday(date, label = TRUE)
  ) |>
  inner_join(given_daily_data, by = "date")

list(
  search_model = lm(
    search ~ num_segments + crime_coverage + welfare_coverage +
      trump_admin + date + day_of_week + month_of_year,
    topic_table_daily
  ),
  search_adj_model = lm(
    search_adj ~ num_segments + crime_coverage + welfare_coverage +
      trump_admin + date + day_of_week + month_of_year,
    topic_table_daily
  )
) |>
  msummary(output = 'markdown')
| search_model | search_adj_model | |
|---|---|---|
| (Intercept) | 117.614 | 93.523 |
| (15.362) | (15.953) | |
| num_segments | 0.141 | 0.203 |
| (0.026) | (0.027) | |
| crime_coverage | -0.431 | 0.277 |
| (0.313) | (0.325) | |
| welfare_coverage | 0.954 | 0.934 |
| (0.783) | (0.814) | |
| trump_adminTRUE | 9.146 | 20.280 |
| (1.048) | (1.088) | |
| date | -0.005 | -0.003 |
| (0.001) | (0.001) | |
| day_of_week.L | 1.176 | 1.579 |
| (0.692) | (0.719) | |
| day_of_week.Q | -7.476 | -7.231 |
| (0.694) | (0.720) | |
| day_of_week.C | -0.020 | -0.447 |
| (0.690) | (0.716) | |
| day_of_week^4 | -0.689 | -0.952 |
| (0.688) | (0.714) | |
| day_of_week^5 | 3.147 | 2.744 |
| (0.688) | (0.714) | |
| day_of_week^6 | -1.341 | -1.567 |
| (0.686) | (0.712) | |
| month_of_year.L | 6.419 | -11.125 |
| (0.941) | (0.977) | |
| month_of_year.Q | 3.229 | 8.303 |
| (0.902) | (0.937) | |
| month_of_year.C | 1.407 | -2.957 |
| (0.923) | (0.958) | |
| month_of_year^4 | 4.613 | 4.193 |
| (0.918) | (0.953) | |
| month_of_year^5 | 0.408 | 2.869 |
| (0.910) | (0.945) | |
| month_of_year^6 | -4.469 | -6.073 |
| (0.920) | (0.955) | |
| month_of_year^7 | 2.725 | 1.715 |
| (0.913) | (0.948) | |
| month_of_year^8 | -2.054 | 0.035 |
| (0.916) | (0.951) | |
| month_of_year^9 | 2.672 | -1.323 |
| (0.917) | (0.953) | |
| month_of_year^10 | -3.272 | -1.169 |
| (0.896) | (0.931) | |
| month_of_year^11 | 0.567 | 1.562 |
| (0.878) | (0.912) | |
| Num.Obs. | 5053 | 5053 |
| R2 | 0.089 | 0.268 |
| R2 Adj. | 0.085 | 0.265 |
| AIC | 43850.3 | 44231.8 |
| BIC | 44006.9 | 44388.4 |
| Log.Lik. | -21901.136 | -22091.890 |
| RMSE | 18.46 | 19.17 |
From here, we see that while neither of them are exactly identical, the one using the adjusted values is much closer to what they got. But the non-adjusted version actually goes against it, with a greater number of searches with less crime coverage.
Looking at the data, it seems that the problem is that they can only look at one time period (say, month) at a time, and each month is only scaled with other searches in that month. But, I don’t understand how their normalization works.
We noticed that they got each prompt separately. Since Google Trends weights everything you are comparing to the max of the other, if we want to replicate their results, then we need to get them separately like what the researchers seem to have done.
report_data <- read_csv('from_google_trends/report_trend_individual.csv', skip=3, col_names=c('month', 'report_trend'))
## Rows: 192 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (1): report_trend
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crime_data <- read_csv('from_google_trends/crime_trend_individual.csv', skip=3, col_names=c('month', 'crime_trend'))
## Rows: 192 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (1): crime_trend
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
welfare_data <- read_csv('from_google_trends/welfare_trend_individual.csv', skip=3, col_names=c('month', 'welfare_trend'))
## Rows: 192 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (1): welfare_trend
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine the individually-downloaded series on the month string, parse
# it into a Date, and tag the presidency as before.
trend_data_individual <- report_data |>
  inner_join(crime_data, by = c("month")) |>
  inner_join(welfare_data, by = c("month")) |>
  mutate(
    month = parse_date(month, "%Y-%m"),
    president = as.factor(
      ifelse(year(month) <= 2008, "Bush",
             ifelse(year(month) <= 2016, "Obama", "Trump"))
    )
  )
# Figure 4 from the individually-downloaded series.
# BUG FIX: factor(..., labels = ...) without levels= renames the default
# alphabetical levels, mislabeling the facets (crime shown as report).
# levels= reorders without renaming. Also se = FALSE, method = "lm".
trend_data_individual |>
  pivot_longer(
    names_to = "trend_type",
    values_to = "trend_value",
    cols = ends_with("_trend")
  ) |>
  mutate(trend_type = factor(
    trend_type,
    levels = c("report_trend", "crime_trend", "welfare_trend")
  )) |>
  ggplot(aes(x = month, y = trend_value, color = president)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  facet_wrap(~ trend_type, dir = "v")
# Table 3 refit on the individually-downloaded series: same
# specification (linear time trend plus Bush/Trump dummies).
table_3_data_individual <- trend_data_individual |>
  mutate(
    is_bush = president == "Bush",
    is_trump = president == "Trump"
  )

list(
  crime = crime_trend ~ month + is_bush + is_trump,
  welfare = welfare_trend ~ month + is_bush + is_trump,
  report = report_trend ~ month + is_bush + is_trump
) |>
  lapply(lm, data = table_3_data_individual) |>
  msummary(output = 'markdown')
| crime | welfare | report | |
|---|---|---|---|
| (Intercept) | -57.817 | -20.711 | 68.858 |
| (23.105) | (27.392) | (17.014) | |
| month | 0.005 | 0.003 | -0.002 |
| (0.001) | (0.002) | (0.001) | |
| is_bushTRUE | 6.951 | 4.410 | -1.618 |
| (4.166) | (4.939) | (3.068) | |
| is_trumpTRUE | 12.422 | 12.235 | 16.077 |
| (4.003) | (4.746) | (2.948) | |
| Num.Obs. | 192 | 192 | 192 |
| R2 | 0.339 | 0.190 | 0.190 |
| R2 Adj. | 0.329 | 0.177 | 0.178 |
| AIC | 1559.6 | 1625.0 | 1442.1 |
| BIC | 1575.9 | 1641.2 | 1458.4 |
| Log.Lik. | -774.798 | -807.478 | -716.049 |
| RMSE | 13.69 | 16.23 | 10.08 |
But the paper still has welfare searches going up in Trump’s presidency, while we have it going down slightly. And in the table, they have welfare going down over time, while we have it going up. So, this discrepancy doesn’t explain anything.